experiment_name <- "fveg_1-24_gb"
experiment_path <- here("data", "4_for_analysis", "ML_outputs",  "experiments", experiment_name)
dataset <- "fveg"
months_ts <- TRUE

ET over fallow lands vs predictions

# read in the fallow fields predictions
fallow <- fread(here(experiment_path, "fallow.csv"))
fallow[fallow == -9999] <- NA # this is the na value

if (months_ts){
  # get a nice column of numeric months
  months <- select(fallow, names(fallow)[grepl("month", names(fallow))])
  fallow$month <- names(months)[max.col(months)] 
  fallow$month <- str_extract(fallow$month, '(?<=_)\\d+') %>% as.numeric()
}

A scatterplot with colors by month

if (months_ts){
  # with different months
  r2 <- summary(lm(ET~ET_pred, data=fallow))$r.squared 
  Bias <- mean(fallow$ET_pred - fallow$ET, na.rm=TRUE)
  ggplot(fallow) +
    geom_jitter(aes(x=ET_pred, y=ET, color=as.factor(month)), alpha=0.2, size =.1) + 
    theme_classic() + 
    geom_abline(intercept=0,slope=1, color="red") + 
    # annotate("text", x=4, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(Bias, 3))) + 
    xlab("Predicted natural ET (mm/day)") + 
    ylab("Observed fallow ET (mm/day)") +
    xlim(c(0,6)) + 
    ylim(c(0,6))
  }
## Warning: Removed 242004 rows containing missing values (geom_point).

print(paste("R2:", round(r2, 3), "Bias:", round(Bias, 3)))
## [1] "R2: 0.512 Bias: 2.836"

A scatterplot with months averaged out

if (months_ts){
  # averaging over different months
  fallow_year <- fallow %>% group_by(x, y) %>% summarize(ET = mean(ET, na.rm=TRUE), 
                                                         ET_pred = mean(ET_pred, na.rm=TRUE))
} else {
  fallow_year <- fallow
}
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
r2 <- summary(lm(ET~ET_pred, data=fallow_year))$r.squared 
Bias <- mean(fallow_year$ET_pred - fallow_year$ET, na.rm=TRUE)
ggplot(fallow_year) +
  geom_jitter(aes(x=ET_pred, y=ET), alpha=0.2, size =.1) + 
  theme_classic() + 
  geom_abline(intercept=0,slope=1, color="red") + 
  # annotate("text", x=3, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(Bias, 3))) + 
  xlab("Predicted natural ET (mm/day)") + 
  ylab("Observed fallow ET (mm/day)") +
    xlim(c(0,4)) + 
    ylim(c(0,4))
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in min(diff(sort(x))): no non-missing arguments to min; returning Inf
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in stats::runif(length(x), -amount, amount): NAs produced
## Warning: Removed 20473 rows containing missing values (geom_point).

print(paste("R2:", round(r2, 3), "Bias:", round(Bias, 3)))
## [1] "R2: 0.534 Bias: 2.836"

A time series

if (months_ts){
  # plot out the time series
  fallow_ts <- fallow %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE), 
                                                         ET_pred = mean(ET_pred, na.rm=TRUE))
  
  ggplot(fallow_ts) + 
    geom_line(aes(x=month, y=ET), color="red") + 
    geom_line(aes(x=month, y=ET_pred), color="black") 
}

Here I just got curious about what ag ET vs fallow ET look like. Seems like fallow ET is probably artificially high.

# ag <- fread(here("data", "3_for_counterfactual", "agriculture", "agriculture.csv"))
# ag$month <- str_extract(ag$month, '(?<=_)\\d+') %>% as.numeric()
# 
# # plot out the time series
# ag_ts <- ag %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE))
# 
# ggplot(ag_ts) + 
#   geom_line(aes(x=month, y=ET), color="red") + 
#   geom_line(data=fallow_ts, aes(x=month, y=ET), color="black")

Spatial CV

# read in the spatial_CV predictions
cv <- fread(here(experiment_path, "crossval_predictions_test.csv"))
cv[cv == -9999] <- NA # this is the na value

if (months_ts){
  # get a nice column of numeric months
  months <- select(cv, names(cv)[grepl("month", names(cv))])
  cv$month <- names(months)[max.col(months)] 
  cv$month <- str_extract(cv$month, '(?<=_)\\d+') %>% as.numeric()
}

cv$mean_dist <- cv$fold_size/6 # the average distance between a held out point and the border to the hold out

map out the performance of the model as hold out sizes increase

if (months_ts){
  gridded_cv_metrics <- cv %>% 
    group_by(x, y, cv_fold, fold_size) %>%
    summarize(ET = mean(ET, na.rm = TRUE), 
              ET_pred = mean(ET_pred, na.rm = TRUE)) %>%
    group_by(cv_fold, fold_size) %>%
    summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2), 
              Bias = mean(ET_pred - ET, na.rm = TRUE), 
              RMSE = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2), 
              mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
              ET = mean(ET, na.rm = TRUE), 
              ET_pred = mean(ET_pred, na.rm = TRUE))
} else {
  gridded_cv_metrics <- cv %>%
    group_by(cv_fold, fold_size) %>%
    summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
              Bias = mean(ET_pred - ET, na.rm = TRUE),
              RMSE = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
              mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
              ET = mean(ET, na.rm = TRUE),
              ET_pred = mean(ET_pred, na.rm = TRUE))
}
## `summarise()` has grouped output by 'x', 'y', 'cv_fold'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'cv_fold'. You can override using the `.groups` argument.

Turn the cv fold back into something meaningful

# first turn "fold" into x and y coordinates
gridded_cv_metrics$x <- as.numeric(str_extract(gridded_cv_metrics$cv_fold, "[-.0-9]*(?=,)")) + gridded_cv_metrics$fold_size/200000
gridded_cv_metrics$y <- as.numeric(str_extract(gridded_cv_metrics$cv_fold, "(?<=,)[-.0-9]*")) + gridded_cv_metrics$fold_size/200000
# study area and counties for plotting
study_area <- st_read(study_area_loc) %>% st_transform(st_crs("+proj=longlat +datum=WGS84"))
## Reading layer `study_area' from data source 
##   `/home/waves/users/annaboser/ET_ag_disALEXI/data/1_raw/study_area/study_area.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 3 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -218673.3 ymin: -352229.6 xmax: 135464.2 ymax: 256511.5
## Projected CRS: NAD83 / California Albers
counties <- st_read(counties_loc) %>% filter(STATEFP == "06") %>% st_transform(st_crs("+proj=longlat +datum=WGS84"))
## Reading layer `cb_2018_us_county_500k' from data source 
##   `/home/waves/users/annaboser/ET_ag_disALEXI/data/1_raw/counties/cb_2018_us_county_500k.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.1489 ymin: -14.5487 xmax: 179.7785 ymax: 71.36516
## Geodetic CRS:  NAD83
gridded_cv_metrics$RMSE <- ifelse(gridded_cv_metrics$RMSE>1.5, 1.5, gridded_cv_metrics$RMSE)
gridded_cv_metrics$Bias <- ifelse(gridded_cv_metrics$Bias>1, 1, gridded_cv_metrics$Bias)
gridded_cv_metrics$Bias <- ifelse(gridded_cv_metrics$Bias<(-1), -1, gridded_cv_metrics$Bias)

filter(gridded_cv_metrics) %>%
  ggplot() + 
  geom_sf(data = counties, color=alpha("white",1), size = .2) + 
  geom_raster(aes(x=x, y=y, fill=Bias)) +
  facet_wrap(vars(fold_size/1000)) +
  scale_fill_distiller(palette="Spectral", direction = -1, limits = c(-1,1)) +
  geom_sf(data = study_area, fill=alpha("red",0), color = "black", size = .2) + 
  theme_classic() +
  xlim(c(-122.92, -118)) + 
  ylim(c(34.7, 40.754)) + 
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(), 
        axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())
## Warning: Raster pixels are placed at uneven horizontal intervals and will be
## shifted. Consider using geom_tile() instead.
## Warning: Raster pixels are placed at uneven vertical intervals and will be
## shifted. Consider using geom_tile() instead.

filter(gridded_cv_metrics) %>%
  ggplot() + 
  geom_sf(data = counties, color=alpha("white",1), size = .2) + 
  geom_raster(aes(x=x, y=y, fill=RMSE)) +
  facet_wrap(vars(fold_size/1000)) +
  scale_fill_distiller(palette="Spectral", direction = -1, limits = c(0,1.5)) +
  geom_sf(data = study_area, fill=alpha("red",0), color = "black", size = .2) + 
  theme_classic() +
  xlim(c(-122.92, -118)) + 
  ylim(c(34.7, 40.754)) + 
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(), 
        axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())
## Warning: Raster pixels are placed at uneven horizontal intervals and will be shifted. Consider using geom_tile() instead.
## Raster pixels are placed at uneven vertical intervals and will be shifted. Consider using geom_tile() instead.

Plot out the performance of the model as hold out sizes get bigger relative to the distance between ag and natural training data

First get the distribution of distances between ag and natural data (specifically the test data)

dist_filename <- here(distance_distribution_path, paste0(dataset, ".csv"))

if (file.exists(dist_filename)){
  library(parallel)
  library(tictoc)
  
  dist <- fread(dist_filename)
} else {
  stop('No distance file for this dataset. Make one using 3_analysis/1_additional_data_manipulation/2_ag_natural_distance.R')
}
## 
## Attaching package: 'tictoc'
## The following object is masked from 'package:data.table':
## 
##     shift
if (months_ts){
  cv_metrics <- cv %>% 
    group_by(x, y, fold_size, mean_dist) %>%
    summarize(ET = mean(ET, na.rm = TRUE), 
              ET_pred = mean(ET_pred, na.rm = TRUE)) %>%
    group_by(fold_size, mean_dist) %>%
    summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2), 
              Bias = mean(ET_pred - ET, na.rm = TRUE), 
              RMSE = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2), 
              mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
              ET = mean(ET, na.rm = TRUE), 
              ET_pred = mean(ET_pred, na.rm = TRUE))
} else {
  cv_metrics <- cv %>%
    group_by(fold_size, mean_dist) %>%
    summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
              Bias = mean(ET_pred - ET, na.rm = TRUE),
              RMSE = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
              mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
              ET = mean(ET, na.rm = TRUE),
              ET_pred = mean(ET_pred, na.rm = TRUE))
}
## `summarise()` has grouped output by 'x', 'y', 'fold_size'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'fold_size'. You can override using the `.groups` argument.
coef = 2/5

ggplot(NULL) + 
  stat_density(data = dist, aes(x=mindist/1000), fill = "grey10", alpha = .3) + 
  geom_line(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef), color = "red") +
  geom_point(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef), color = "red") +
  xlab(TeX("Distance of agricultural pixel to nearest natural pixel (km) \n Average distance of point to border of hold-out (km)")) + 
  theme_bw() + 
  theme(axis.title.x = element_text(vjust=-2.5)) +
  xlim(c(0, max(cv_metrics$mean_dist/1000))) + 
  scale_y_continuous(
    name = "Distribution of agricultural pixels",
    sec.axis = sec_axis(~./coef, name=TeX("Model performance ($R^{2}$)")))
## Warning: Removed 3 rows containing non-finite values (stat_density).

Plot the performance for the different months too

if (months_ts){
  cv_metrics <- cv %>% 
    group_by(fold_size, month, mean_dist) %>%
    summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2), 
              Bias = mean(ET_pred - ET, na.rm = TRUE), 
              RMSE = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2), 
              mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
              ET = mean(ET, na.rm = TRUE), 
              ET_pred = mean(ET_pred, na.rm = TRUE))
  
  coef = 2/5
  
  ggplot(NULL) + 
    stat_density(data = dist, aes(x=mindist/1000), fill = "grey10", alpha = .3) + 
    geom_line(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef, color = as.factor(month))) +
    geom_point(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef, color = as.factor(month))) +
    xlab(TeX("Distance of agricultural pixel to nearest natural pixel (km) \n Average distance of point to border of hold-out (km)")) + 
    theme_bw() + 
    theme(axis.title.x = element_text(vjust=-2.5)) +
    xlim(c(0, max(cv_metrics$mean_dist/1000))) + 
    scale_y_continuous(
      name = "Distribution of agricultural pixels",
      sec.axis = sec_axis(~./coef, name=TeX("Model performance ($R^{2}$)")))
}
## `summarise()` has grouped output by 'fold_size', 'month'. You can override using
## the `.groups` argument.
## Warning: Removed 3 rows containing non-finite values (stat_density).

Some scatterplots and time series

Make these using the CV with the fold size that is closest to the double of the mean distance between ag and natural (a bit conservative but only slightly)

mean_dist <- mean(dist$mindist)
folds <- unique(cv$mean_dist)
best_fold <- folds[abs(folds-(mean_dist)) == min(abs(folds-(mean_dist)))]
cv_best_fold <- filter(cv, mean_dist == best_fold)
print(cv_best_fold$fold_size %>% unique())
## [1] 10000

A scatterplot with colors by month

if (months_ts){
  # with different months
  r2 <- summary(lm(ET~ET_pred, data=cv_best_fold))$r.squared 
  Bias <- mean(cv_best_fold$ET_pred - cv_best_fold$ET, na.rm=TRUE)
  ggplot(cv_best_fold) +
    geom_jitter(aes(x=ET_pred, y=ET, color=as.factor(month)), alpha=0.2, size =.1) + 
    theme_classic() + 
    geom_abline(intercept=0,slope=1, color="red") + 
    # annotate("text", x=4.5, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(Bias, 3))) + 
    xlab("Predicted natural ET (mm/day)") + 
    ylab("Observed fallow ET (mm/day)") +
    xlim(c(0,8)) + 
    ylim(c(0,8))
}
## Warning: Removed 37436196 rows containing missing values (geom_point).

print(paste("R2:", round(r2, 3), "Bias:", round(Bias, 3)))
## [1] "R2: 0.63 Bias: 0.055"

A scatterplot with months averaged out

if (months_ts){
  # averaging over different months
  cv_best_fold_year <- cv_best_fold %>% group_by(x, y) %>% summarize(ET = mean(ET, na.rm=TRUE), 
                                                         ET_pred = mean(ET_pred, na.rm=TRUE))
} else {
  cv_best_fold_year <- cv_best_fold
}
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
r2 <- summary(lm(ET~ET_pred, data=cv_best_fold_year))$r.squared 
Bias <- mean(cv_best_fold_year$ET_pred - cv_best_fold_year$ET, na.rm=TRUE)
ggplot(cv_best_fold_year) +
  geom_jitter(aes(x=ET_pred, y=ET), alpha=0.02, size =.01) + 
  theme_classic() + 
  geom_abline(intercept=0,slope=1, color="red") + 
  # annotate("text", x=4.5, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(Bias, 3))) + 
  xlab("Predicted natural ET (mm/day)") + 
  ylab("Observed fallow ET (mm/day)") +
    xlim(c(0,6)) + 
    ylim(c(0,6))
## Warning: Removed 3337080 rows containing missing values (geom_point).

print(paste("R2:", round(r2, 3), "Bias:", round(Bias, 3)))
## [1] "R2: 0.633 Bias: 0.055"

A time series

if (months_ts){
  # plot out the time series
  cv_best_fold_ts <- cv_best_fold %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE), 
                                                         ET_pred = mean(ET_pred, na.rm=TRUE))
  
  ggplot(cv_best_fold_ts) + 
    geom_line(aes(x=month, y=ET), color="red") + 
    geom_line(aes(x=month, y=ET_pred), color="black") + 
    geom_line(data = fallow_ts, aes(x=month, y=ET), color="blue") + 
    geom_line(data = fallow_ts, aes(x=month, y=ET_pred), color="green") 
}